##############################################################
### Project: Where do parties talk about what? Party issue ###
###          salience across communication channels        ###
### Task:    Validation - Parliamentary Speeches           ###
### Title:   Validation_ParlSpeeches.R                     ###
##############################################################

#---------------------------------------------------------------------------------
# Description:
#
# This script validates the classification model for parliamentary speeches in
# Austria, Germany and Switzerland against the manual coding.
#---------------------------------------------------------------------------------

#---------------------------------------------------------------------------------------------------------------------

# Preparation

## clear global environment
#rm(list = ls())

## load packages
library("tidyverse")
library("xlsx")
library("newsmap")

## load classification results
## - the first CSV column is discarded (presumably a row index written on
##   export -- TODO confirm)
## - `date` is parsed to Date, underscores in `issue` become spaces
data <- read.csv("Results_ParlSpeeches.csv", encoding = "UTF-8") %>%
  select(-1) %>%
  mutate(
    date = as.Date(date),
    issue = gsub("_", " ", issue)
  )

## load manual gold standard
df_manual <- readRDS("./ManualGoldStandard/ParlSpeeches/Sample_ParlSpeeches.RDS")

## create validation data set
## NOTE(review): columns 1 and 32 of `data` are picked by position; they have
## to contain the join key `text_id`, the other one is presumably the
## predicted `issue` -- confirm against the CSV layout.
df <- df_manual %>%
  left_join(data[, c(1, 32)], by = "text_id")

## Missing values in columns 13 and 14 (presumably the manual and predicted
## labels -- TODO confirm) are recoded to the literal string "NA".
df[, 13:14] <- replace(df[, 13:14], is.na(df[, 13:14]), "NA")

#---------------------------------------------------------------------------------------------------------------------

# Validation Statistics

## Helper: share of rows in `sample` whose predicted label (`issue`) equals the
## manual label (`manual_coder`), rounded to two digits. which() skips NA
## comparisons, so such rows count as disagreement.
.share_correct <- function(sample) {
  hits <- which(sample$issue == sample$manual_coder)
  round(length(hits) / nrow(sample), digits = 2)
}

## Helper: macro-average of per-class `scores`, rounded to two digits.
## Undefined scores (NA/NaN) are dropped from the sum but kept in the
## denominator, i.e. they enter the average as 0.
.macro_average <- function(scores) {
  round(sum(scores, na.rm = TRUE) / length(scores), digits = 2)
}

## Accuracy
acc_parlspeeches <- .share_correct(df)

## Statistics (precision, recall, f1 scores)
df_acc <- accuracy(df$issue, df$manual_coder)

## Balanced accuracy (macro-averaged recall)
balacc_parlspeeches <- .macro_average(df_acc$recall)

## Macro average F1
f1_parlspeeches <- .macro_average(df_acc$f1)

## Statistics per country (same four statistics on each country subset)
df_AT <- filter(df, country == "AT")
acc_parlspeeches_AT <- .share_correct(df_AT)
df_acc_AT <- accuracy(df_AT$issue, df_AT$manual_coder)
balacc_parlspeeches_AT <- .macro_average(df_acc_AT$recall)
f1_parlspeeches_AT <- .macro_average(df_acc_AT$f1)

df_CH <- filter(df, country == "CH")
acc_parlspeeches_CH <- .share_correct(df_CH)
df_acc_CH <- accuracy(df_CH$issue, df_CH$manual_coder)
balacc_parlspeeches_CH <- .macro_average(df_acc_CH$recall)
f1_parlspeeches_CH <- .macro_average(df_acc_CH$f1)

df_DE <- filter(df, country == "DE")
acc_parlspeeches_DE <- .share_correct(df_DE)
df_acc_DE <- accuracy(df_DE$issue, df_DE$manual_coder)
balacc_parlspeeches_DE <- .macro_average(df_acc_DE$recall)
f1_parlspeeches_DE <- .macro_average(df_acc_DE$f1)

#---------------------------------------------------------------------------------------------------------------------
